/**********************************************************************/
/*
   Author: Michelle Han
   Created: 30 September, 2022
   Description: Merges and reshapes PMO data transfers:
		6. JPAL_DATA_9: baseline characteristics for newly added observations in batch 1-22

	 Outputs data at person level.
   Note that this code works with PII data (not deidentified).

   Note: to set filepaths, run MASTER.do.

   Output:
	 pmo_b1-22_raw_add_vars_baseline_wide.dta

*/
/**********************************************************************/

	* Load the filepath globals when the global master_run is not "1"
	if "$master_run" != "1" {
		include "./Do/SET_FILEPATHS.do"
	}

	* Close any log that is still open, then start a date-stamped text log
	capture log close
	local logstamp : display %tdCYND td(`c(current_date)')
	log using "$KP_logs/`logstamp'_PMO_merge_wide_add_vars_baseline_b1-22.txt", replace text

/*----------------------------------------------------*/
            /* Section: Append batches */
/*----------------------------------------------------*/

* Stage batches 2-22: harmonise the score variable name, stamp the batch
* number, and park each batch in its own tempfile
	quietly forvalues b = 2/22 {
		use "$KP_deid_admin/Raw/JPAL_DATA_9/final add baseline_`b'.dta", clear
		rename ticket_score bobot
		generate batch = `b'
		tempfile staged`b'
		save `staged`b''
	}

* Batch 1 is the base dataset
	use "$KP_deid_admin/Raw/JPAL_DATA_9/final add baseline_1.dta", clear
	rename ticket_score bobot
	generate batch = 1

* Stack the staged batches onto the base, in batch order
	quietly forvalues b = 2/22 {
		noisily display "Appending Batch `b'"
		append using `staged`b''
	}

* The stacked data must be unique at the person-batch level
	gisid anon_id4 batch
	sort anon_id4 batch
	tabulate batch

	/*----------------------------------------------------*/
	            /* Section: Check Data */
	/*----------------------------------------------------*/

* check aaa vars
* Each group of aaa variables is checked over its own batch window: within a
* person, the max and min over that window are compared. A mismatch is
* reported but does not stop the run (the assert is captured).
	local group1 "aaa?"
	local window1 "batch <= 11"
	local group2 "aaa2?"
	local window2 "batch > 11"
	local group3 "aaa3?"
	local window3 "batch > 17"

	forvalues g = 1/3 {
		quietly foreach v of varlist `group`g'' {
			noisily display as text "Checking `v'..."
			gsort anon_id4
			by anon_id4: gegen `v'max = max(`v') if `window`g''
			by anon_id4: gegen `v'min = min(`v') if `window`g''
			summarize `v'max `v'min
			tabulate `v'max `v'min
			capture noisily assert `v'max == `v'min
			if _rc {
				noisily display "There are changes in `v'. Max and min left in memory."
			}
			else {
				drop `v'max `v'min
			}
		}
		* Any helper columns still in memory are removed before the next group
		capture drop aaa*max aaa*min
	}

* check test score
* Within a person, the rounded test score must be constant across batches,
* ignoring missing values. Missing values sort last, so the first record in
* each person group holds the smallest non-missing score (or missing when the
* person has no score at all). Every record must equal that first value or be
* missing. Comparing only the first and last record would let a person with
* any missing score pass even when their non-missing scores differ.
	gen test_score_round = round(final_test_score, 1)
	gsort anon_id4 test_score_round
	by anon_id4 (test_score_round): assert test_score_round == test_score_round[1] | missing(test_score_round)

	drop test_score_round

* Flag, per person, whether each demographic variable takes different values
* across that person's records (first vs last value after sorting on it).
* The flags are tabulated for inspection only; nothing is asserted.
	foreach demo of varlist year_dob anon_month_dob gender education anon_hh_id anon_prov_id {
		bysort anon_id4 (`demo'): generate tag_`demo' = (`demo'[1] != `demo'[_N])
		tabulate tag_`demo'
	}

* The flags are diagnostics only and are not kept
	drop tag_*

* Household application counts
* Note: one household with 273 individuals applying?
* num_apply_in_batch_hh : records per household-batch (set only where anon_prov_id != .)
* total_apply_hh        : number of distinct individuals per household
	gsort anon_hh_id batch
	by anon_hh_id batch: generate num_apply_in_batch_hh = _N if anon_prov_id != .
	gegen unique_applicants_hh = tag(anon_hh_id anon_id4)
	by anon_hh_id: gegen total_apply_hh = total(unique_applicants_hh)

* Tabulate on a temporary copy so the person-batch data are untouched
	preserve
		* one record per household-batch; the sort keys do not break ties,
		* so which record is kept within a household-batch is not pinned down
		gsort anon_hh_id batch
		by anon_hh_id batch: keep if _n == 1
		tabulate num_apply_in_batch_hh

		forvalues b = 1/22 {
			display "Batch: `b':"
			tabulate num_apply_in_batch_hh if batch == `b'
		}

		* one record per household
		by anon_hh_id: keep if _n == 1
		tabulate total_apply_hh
	restore

* Earliest batch in which each individual appears
	gsort anon_id4
	gegen first_apply_batch = min(batch), by(anon_id4)

	/*----------------------------------------------------*/
	            /* Section: Reshape */
	/*----------------------------------------------------*/

	gsort anon_id4

* Prep to reshape wide
* applied is a constant 1 on every person-batch record. The trailing
* underscore added to the aaa variables separates each stub from the batch
* suffix appended by the reshape.
	gen applied = 1
	rename aaa* aaa*_
	compress

* Reshape wide
* Add to list in future if other variables are needed
	local varlist applied ///
								anon_hh_id ///
								first_apply_batch ///
								has_passed_current_batch ///
								bobot ///
								year_dob ///
								anon_month_dob ///
								gender ///
								education ///
								anon_prov_id ///
								final_test_score ///
								aaa*_

* Keep only the reshape variables plus the i() and j() identifiers
	keep `varlist' anon_id4 batch

* One row per anon_id4, one column per variable-batch combination.
* The macro reference is written without a stray space before the closing quote.
	greshape wide `varlist', ///
		i(anon_id4) j(batch) nochecks benchmark

* Clean up data
* For each stub below, collapse its per-batch columns into one column holding
* the first non-missing value, then drop the per-batch columns. The order of
* the list sets the order in which the new columns are created.
	local collapse_stubs first_apply_batch year_dob anon_month_dob gender final_test_score education
	foreach k of numlist 1/9 20/27 31/38 {
		local collapse_stubs `collapse_stubs' aaa`k'_
	}

	foreach stub of local collapse_stubs {
		egen `stub' = rowfirst(`stub'*)
		drop `stub'?*
	}

* Province and household IDs are taken from the first batch the person applied
* in; win_batch is the batch with has_passed_current_batch == 1 (the highest
* such batch if there are several, because later batches overwrite earlier ones).
* NOTE(review): these columns are created with the default storage type; if the
* IDs can exceed float precision (16,777,216) they would be rounded -- confirm ID range.
	foreach newvar in anon_prov_id anon_hh_id win_batch {
		generate `newvar' = .
	}

	quietly forvalues b = 1/22 {
		foreach id in anon_prov_id anon_hh_id {
			replace `id' = `id'`b' if first_apply_batch == `b'
		}
		replace win_batch = `b' if has_passed_current_batch`b' == 1
	}

	drop anon_hh_id?* anon_prov_id?*

* Strip the trailing underscore that was added to the aaa names before the reshape
	rename aaa?_ aaa?
	rename aaa*_ aaa*

* Save raw merged reshaped data
* The file is written only when the data signature equals the reference value
* below. On a mismatch the run stops with return code 459 ("something that
* should be true of your data is not") rather than relying on an unrecognized
* command to halt, and the log is closed first so it is not left open.
datasignature
if "`r(datasignature)'" == "427940:101(107625):247160772:3462463568" {
	save "$KP_deid_admin/Clean/pmo_b1-22_raw_add_vars_baseline_wide.dta", replace
}
else {
	di as err "Careful, your machine produces a different dataset"
	cap log close
	error 459
}


	cap log close


// DONE
